{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "from os.path import dirname, realpath, join\n",
    "base_dir = dirname(dirname(os.getcwd()))\n",
    "\n",
    "import pandas as pd\n",
    "from os.path import join"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "sys.path.insert(0, base_dir)\n",
    "from config_path import PROSTATE_DATA_PATH, PLOTS_PATH"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tab-separated sample mapping table; later cells use its 'fusion_name',\n",
    "# 'tpm_col' and 'Tumor_Sample_Barcode' columns.\n",
    "mapping_file = join(PROSTATE_DATA_PATH, 'raw_data/sample_mapping.tsv')\n",
    "mapping_ids = pd.read_csv(filepath_or_buffer=mapping_file, sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# STAR-Fusion output table (tab-separated); the second column of the file\n",
    "# (index_col=1) is used as the row index.\n",
    "fusions_file = join(PROSTATE_DATA_PATH, 'raw_data/outputs_p1000_n=660_star_fusion.tsv')\n",
    "fusions_data = pd.read_csv(filepath_or_buffer=fusions_file, sep='\\t', index_col=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Match each mapping row to the fusion table through its 'fusion_name' value\n",
    "# (looked up in the fusion table's index); how='inner' keeps matched rows only.\n",
    "fusions_data_with_proper_id = mapping_ids.join(\n",
    "    other=fusions_data,\n",
    "    on='fusion_name',\n",
    "    how='inner'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "659"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct index labels that survived the inner join\n",
    "# (dropna=False so the count matches len(index.unique()) exactly).\n",
    "fusions_data_with_proper_id.index.nunique(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Path to the raw TPM expression matrix (tab-separated). The variable was\n",
    "# previously spelled 'tmp_file', which read as a temporary file; no other cell\n",
    "# references it.\n",
    "tpm_file = join(PROSTATE_DATA_PATH, 'raw_data/outputs_p1000_rna_n=660_tpm_matrix.tsv')\n",
    "# The first column of the file becomes the row index.\n",
    "tpm_data = pd.read_csv(tpm_file, sep='\\t', index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>hgnc_id</th>\n",
       "      <th>TSPAN6</th>\n",
       "      <th>TNMD</th>\n",
       "      <th>DPM1</th>\n",
       "      <th>SCYL3</th>\n",
       "      <th>C1orf112</th>\n",
       "      <th>FGR</th>\n",
       "      <th>CFH</th>\n",
       "      <th>FUCA2</th>\n",
       "      <th>GCLC</th>\n",
       "      <th>NFYA</th>\n",
       "      <th>...</th>\n",
       "      <th>H2BE1</th>\n",
       "      <th>GET1-SH3BGR</th>\n",
       "      <th>SPDYE15</th>\n",
       "      <th>NOTCH2NLB</th>\n",
       "      <th>SPDYE13</th>\n",
       "      <th>ASDURF</th>\n",
       "      <th>SPEGNB</th>\n",
       "      <th>SPDYE14</th>\n",
       "      <th>DERPC</th>\n",
       "      <th>NOTCH2NLC</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>MO_1008-Tumor_Dura_tcap</th>\n",
       "      <td>0.304045</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>25.256696</td>\n",
       "      <td>1.950081</td>\n",
       "      <td>1.100852</td>\n",
       "      <td>2.652530</td>\n",
       "      <td>2.505749</td>\n",
       "      <td>66.774552</td>\n",
       "      <td>55.514406</td>\n",
       "      <td>8.230181</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.472808</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.125812</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.429857</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.828260</td>\n",
       "      <td>0.230655</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MO_1012-Tumor-Subcutaneous_nodule_polyA</th>\n",
       "      <td>35.076985</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>31.772289</td>\n",
       "      <td>5.271777</td>\n",
       "      <td>12.038535</td>\n",
       "      <td>1.211722</td>\n",
       "      <td>11.487752</td>\n",
       "      <td>32.275862</td>\n",
       "      <td>18.380403</td>\n",
       "      <td>22.125725</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>13.942669</td>\n",
       "      <td>0.0</td>\n",
       "      <td>19.293129</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>5.271777</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MO_1013-Tumor_polyA</th>\n",
       "      <td>5.051672</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>27.234353</td>\n",
       "      <td>5.824887</td>\n",
       "      <td>2.697661</td>\n",
       "      <td>17.715216</td>\n",
       "      <td>23.574468</td>\n",
       "      <td>25.550462</td>\n",
       "      <td>55.602754</td>\n",
       "      <td>14.639538</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.738850</td>\n",
       "      <td>0.0</td>\n",
       "      <td>12.354258</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3.505242</td>\n",
       "      <td>1.065319</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.244092</td>\n",
       "      <td>0.326469</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MO_1014-Tumor_polyA</th>\n",
       "      <td>33.878848</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>24.040707</td>\n",
       "      <td>4.237129</td>\n",
       "      <td>3.491540</td>\n",
       "      <td>2.491359</td>\n",
       "      <td>15.602819</td>\n",
       "      <td>33.369665</td>\n",
       "      <td>8.874330</td>\n",
       "      <td>9.583549</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>18.239659</td>\n",
       "      <td>0.0</td>\n",
       "      <td>6.273861</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>14.366232</td>\n",
       "      <td>0.909255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MO_1015-Tumor_tcap</th>\n",
       "      <td>11.853928</td>\n",
       "      <td>2.469568</td>\n",
       "      <td>66.280807</td>\n",
       "      <td>16.299152</td>\n",
       "      <td>15.696818</td>\n",
       "      <td>2.963482</td>\n",
       "      <td>79.664663</td>\n",
       "      <td>18.961467</td>\n",
       "      <td>37.139900</td>\n",
       "      <td>21.491269</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.614254</td>\n",
       "      <td>0.0</td>\n",
       "      <td>80.062203</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.939515</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.963482</td>\n",
       "      <td>13.094736</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 19192 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "hgnc_id                                     TSPAN6      TNMD       DPM1  \\\n",
       "MO_1008-Tumor_Dura_tcap                   0.304045  0.000000  25.256696   \n",
       "MO_1012-Tumor-Subcutaneous_nodule_polyA  35.076985  0.000000  31.772289   \n",
       "MO_1013-Tumor_polyA                       5.051672  0.000000  27.234353   \n",
       "MO_1014-Tumor_polyA                      33.878848  0.000000  24.040707   \n",
       "MO_1015-Tumor_tcap                       11.853928  2.469568  66.280807   \n",
       "\n",
       "hgnc_id                                      SCYL3   C1orf112        FGR  \\\n",
       "MO_1008-Tumor_Dura_tcap                   1.950081   1.100852   2.652530   \n",
       "MO_1012-Tumor-Subcutaneous_nodule_polyA   5.271777  12.038535   1.211722   \n",
       "MO_1013-Tumor_polyA                       5.824887   2.697661  17.715216   \n",
       "MO_1014-Tumor_polyA                       4.237129   3.491540   2.491359   \n",
       "MO_1015-Tumor_tcap                       16.299152  15.696818   2.963482   \n",
       "\n",
       "hgnc_id                                        CFH      FUCA2       GCLC  \\\n",
       "MO_1008-Tumor_Dura_tcap                   2.505749  66.774552  55.514406   \n",
       "MO_1012-Tumor-Subcutaneous_nodule_polyA  11.487752  32.275862  18.380403   \n",
       "MO_1013-Tumor_polyA                      23.574468  25.550462  55.602754   \n",
       "MO_1014-Tumor_polyA                      15.602819  33.369665   8.874330   \n",
       "MO_1015-Tumor_tcap                       79.664663  18.961467  37.139900   \n",
       "\n",
       "hgnc_id                                       NFYA    ...      H2BE1  \\\n",
       "MO_1008-Tumor_Dura_tcap                   8.230181    ...        0.0   \n",
       "MO_1012-Tumor-Subcutaneous_nodule_polyA  22.125725    ...        0.0   \n",
       "MO_1013-Tumor_polyA                      14.639538    ...        0.0   \n",
       "MO_1014-Tumor_polyA                       9.583549    ...        0.0   \n",
       "MO_1015-Tumor_tcap                       21.491269    ...        0.0   \n",
       "\n",
       "hgnc_id                                  GET1-SH3BGR  SPDYE15  NOTCH2NLB  \\\n",
       "MO_1008-Tumor_Dura_tcap                     5.472808      0.0   0.125812   \n",
       "MO_1012-Tumor-Subcutaneous_nodule_polyA     0.000000      0.0  13.942669   \n",
       "MO_1013-Tumor_polyA                         0.738850      0.0  12.354258   \n",
       "MO_1014-Tumor_polyA                         0.000000      0.0  18.239659   \n",
       "MO_1015-Tumor_tcap                          1.614254      0.0  80.062203   \n",
       "\n",
       "hgnc_id                                  SPDYE13     ASDURF    SPEGNB  \\\n",
       "MO_1008-Tumor_Dura_tcap                      0.0   0.000000  0.429857   \n",
       "MO_1012-Tumor-Subcutaneous_nodule_polyA      0.0  19.293129  0.000000   \n",
       "MO_1013-Tumor_polyA                          0.0   3.505242  1.065319   \n",
       "MO_1014-Tumor_polyA                          0.0   6.273861  0.000000   \n",
       "MO_1015-Tumor_tcap                           0.0   1.939515  0.000000   \n",
       "\n",
       "hgnc_id                                  SPDYE14      DERPC  NOTCH2NLC  \n",
       "MO_1008-Tumor_Dura_tcap                      0.0   0.828260   0.230655  \n",
       "MO_1012-Tumor-Subcutaneous_nodule_polyA      0.0   0.000000   5.271777  \n",
       "MO_1013-Tumor_polyA                          0.0   4.244092   0.326469  \n",
       "MO_1014-Tumor_polyA                          0.0  14.366232   0.909255  \n",
       "MO_1015-Tumor_tcap                           0.0   2.963482  13.094736  \n",
       "\n",
       "[5 rows x 19192 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The TPM file is loaded with genes (index name 'hgnc_id') on the rows, while the\n",
    "# joins further down need one row per sample. An unconditional transpose makes\n",
    "# this cell flip the matrix back when it is re-run, so only transpose while the\n",
    "# genes are still on the row axis.\n",
    "if tpm_data.index.name == 'hgnc_id':\n",
    "    tpm_data = tpm_data.T\n",
    "# Fail loudly instead of silently continuing with the wrong orientation.\n",
    "assert tpm_data.columns.name == 'hgnc_id', 'expected genes (hgnc_id) on the columns'\n",
    "tpm_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the two mapping columns needed to relabel the TPM rows.\n",
    "tpm_mapping_columns = ['Tumor_Sample_Barcode', 'tpm_col']\n",
    "mapping_ids_tpm = mapping_ids.loc[:, tpm_mapping_columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look up each mapping row's 'tpm_col' value in the index of tpm_data and attach\n",
    "# the matching expression row; how='inner' keeps matched rows only.\n",
    "tpm_data_with_propoer_id = mapping_ids_tpm.join(\n",
    "    other=tpm_data,\n",
    "    on='tpm_col',\n",
    "    how='inner'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(659, 19193)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 'tpm_col' was only needed as the join key. errors='ignore' keeps this cell\n",
    "# re-runnable: on a second execution the column is already gone and a plain\n",
    "# drop() would raise.\n",
    "tpm_data_with_propoer_id = tpm_data_with_propoer_id.drop('tpm_col', axis=1, errors='ignore')\n",
    "tpm_data_with_propoer_id.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Tumor_Sample_Barcode</th>\n",
       "      <th>TSPAN6</th>\n",
       "      <th>TNMD</th>\n",
       "      <th>DPM1</th>\n",
       "      <th>SCYL3</th>\n",
       "      <th>C1orf112</th>\n",
       "      <th>FGR</th>\n",
       "      <th>CFH</th>\n",
       "      <th>FUCA2</th>\n",
       "      <th>GCLC</th>\n",
       "      <th>...</th>\n",
       "      <th>H2BE1</th>\n",
       "      <th>GET1-SH3BGR</th>\n",
       "      <th>SPDYE15</th>\n",
       "      <th>NOTCH2NLB</th>\n",
       "      <th>SPDYE13</th>\n",
       "      <th>ASDURF</th>\n",
       "      <th>SPEGNB</th>\n",
       "      <th>SPDYE14</th>\n",
       "      <th>DERPC</th>\n",
       "      <th>NOTCH2NLC</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TCGA-EJ-5499</td>\n",
       "      <td>60.674260</td>\n",
       "      <td>0.048559</td>\n",
       "      <td>41.566360</td>\n",
       "      <td>9.286877</td>\n",
       "      <td>3.156324</td>\n",
       "      <td>2.294405</td>\n",
       "      <td>9.869583</td>\n",
       "      <td>38.640690</td>\n",
       "      <td>27.144387</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.376331</td>\n",
       "      <td>0.0</td>\n",
       "      <td>27.484299</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3.144184</td>\n",
       "      <td>0.388471</td>\n",
       "      <td>0.0</td>\n",
       "      <td>19.241437</td>\n",
       "      <td>2.063750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>MO_1012</td>\n",
       "      <td>35.076985</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>31.772289</td>\n",
       "      <td>5.271777</td>\n",
       "      <td>12.038535</td>\n",
       "      <td>1.211722</td>\n",
       "      <td>11.487752</td>\n",
       "      <td>32.275862</td>\n",
       "      <td>18.380403</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>13.942669</td>\n",
       "      <td>0.0</td>\n",
       "      <td>19.293129</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>5.271777</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>TCGA-CH-5752</td>\n",
       "      <td>9.524737</td>\n",
       "      <td>0.046519</td>\n",
       "      <td>40.657484</td>\n",
       "      <td>9.803850</td>\n",
       "      <td>3.895955</td>\n",
       "      <td>1.860754</td>\n",
       "      <td>5.151964</td>\n",
       "      <td>65.905596</td>\n",
       "      <td>19.421624</td>\n",
       "      <td>...</td>\n",
       "      <td>0.093038</td>\n",
       "      <td>0.174446</td>\n",
       "      <td>0.0</td>\n",
       "      <td>15.339594</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.396188</td>\n",
       "      <td>0.779191</td>\n",
       "      <td>0.0</td>\n",
       "      <td>19.910072</td>\n",
       "      <td>5.419447</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>SC_9126</td>\n",
       "      <td>9.881191</td>\n",
       "      <td>0.079595</td>\n",
       "      <td>82.233337</td>\n",
       "      <td>8.357509</td>\n",
       "      <td>40.195637</td>\n",
       "      <td>6.958905</td>\n",
       "      <td>17.351780</td>\n",
       "      <td>49.747076</td>\n",
       "      <td>115.811192</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>4.434596</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.548304</td>\n",
       "      <td>1.250784</td>\n",
       "      <td>0.0</td>\n",
       "      <td>22.582330</td>\n",
       "      <td>0.227415</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>PROS01448-6115227-SM-67ERU</td>\n",
       "      <td>20.259544</td>\n",
       "      <td>0.080769</td>\n",
       "      <td>53.886349</td>\n",
       "      <td>10.190349</td>\n",
       "      <td>6.663438</td>\n",
       "      <td>0.430768</td>\n",
       "      <td>1.669225</td>\n",
       "      <td>54.357502</td>\n",
       "      <td>58.436333</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.201922</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.980752</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.198073</td>\n",
       "      <td>0.174999</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.182674</td>\n",
       "      <td>2.046147</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 19193 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         Tumor_Sample_Barcode     TSPAN6      TNMD       DPM1      SCYL3  \\\n",
       "0                TCGA-EJ-5499  60.674260  0.048559  41.566360   9.286877   \n",
       "1                     MO_1012  35.076985  0.000000  31.772289   5.271777   \n",
       "2                TCGA-CH-5752   9.524737  0.046519  40.657484   9.803850   \n",
       "4                     SC_9126   9.881191  0.079595  82.233337   8.357509   \n",
       "6  PROS01448-6115227-SM-67ERU  20.259544  0.080769  53.886349  10.190349   \n",
       "\n",
       "    C1orf112       FGR        CFH      FUCA2        GCLC    ...         H2BE1  \\\n",
       "0   3.156324  2.294405   9.869583  38.640690   27.144387    ...      0.000000   \n",
       "1  12.038535  1.211722  11.487752  32.275862   18.380403    ...      0.000000   \n",
       "2   3.895955  1.860754   5.151964  65.905596   19.421624    ...      0.093038   \n",
       "4  40.195637  6.958905  17.351780  49.747076  115.811192    ...      0.000000   \n",
       "6   6.663438  0.430768   1.669225  54.357502   58.436333    ...      0.000000   \n",
       "\n",
       "   GET1-SH3BGR  SPDYE15  NOTCH2NLB  SPDYE13     ASDURF    SPEGNB  SPDYE14  \\\n",
       "0     0.376331      0.0  27.484299      0.0   3.144184  0.388471      0.0   \n",
       "1     0.000000      0.0  13.942669      0.0  19.293129  0.000000      0.0   \n",
       "2     0.174446      0.0  15.339594      0.0   5.396188  0.779191      0.0   \n",
       "4     4.434596      0.0   0.000000      0.0   4.548304  1.250784      0.0   \n",
       "6     0.201922      0.0   4.980752      0.0   1.198073  0.174999      0.0   \n",
       "\n",
       "       DERPC  NOTCH2NLC  \n",
       "0  19.241437   2.063750  \n",
       "1   0.000000   5.271777  \n",
       "2  19.910072   5.419447  \n",
       "4  22.582330   0.227415  \n",
       "6   5.182674   2.046147  \n",
       "\n",
       "[5 rows x 19193 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the barcode-labelled TPM table.\n",
    "tpm_data_with_propoer_id.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the sample barcode as the row index. Once that is done the column no longer\n",
    "# exists, so an unguarded set_index() raises KeyError when this cell is re-run;\n",
    "# skip the call if the index already carries the barcode.\n",
    "if tpm_data_with_propoer_id.index.name != 'Tumor_Sample_Barcode':\n",
    "    tpm_data_with_propoer_id = tpm_data_with_propoer_id.set_index('Tumor_Sample_Barcode')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Round every value to two decimal places before the table is written out.\n",
    "tpm_data_with_propoer_id = tpm_data_with_propoer_id.round(decimals=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>TSPAN6</th>\n",
       "      <th>TNMD</th>\n",
       "      <th>DPM1</th>\n",
       "      <th>SCYL3</th>\n",
       "      <th>C1orf112</th>\n",
       "      <th>FGR</th>\n",
       "      <th>CFH</th>\n",
       "      <th>FUCA2</th>\n",
       "      <th>GCLC</th>\n",
       "      <th>NFYA</th>\n",
       "      <th>...</th>\n",
       "      <th>H2BE1</th>\n",
       "      <th>GET1-SH3BGR</th>\n",
       "      <th>SPDYE15</th>\n",
       "      <th>NOTCH2NLB</th>\n",
       "      <th>SPDYE13</th>\n",
       "      <th>ASDURF</th>\n",
       "      <th>SPEGNB</th>\n",
       "      <th>SPDYE14</th>\n",
       "      <th>DERPC</th>\n",
       "      <th>NOTCH2NLC</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Tumor_Sample_Barcode</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>TCGA-EJ-5499</th>\n",
       "      <td>60.67</td>\n",
       "      <td>0.05</td>\n",
       "      <td>41.57</td>\n",
       "      <td>9.29</td>\n",
       "      <td>3.16</td>\n",
       "      <td>2.29</td>\n",
       "      <td>9.87</td>\n",
       "      <td>38.64</td>\n",
       "      <td>27.14</td>\n",
       "      <td>14.56</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.38</td>\n",
       "      <td>0.0</td>\n",
       "      <td>27.48</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3.14</td>\n",
       "      <td>0.39</td>\n",
       "      <td>0.0</td>\n",
       "      <td>19.24</td>\n",
       "      <td>2.06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MO_1012</th>\n",
       "      <td>35.08</td>\n",
       "      <td>0.00</td>\n",
       "      <td>31.77</td>\n",
       "      <td>5.27</td>\n",
       "      <td>12.04</td>\n",
       "      <td>1.21</td>\n",
       "      <td>11.49</td>\n",
       "      <td>32.28</td>\n",
       "      <td>18.38</td>\n",
       "      <td>22.13</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>13.94</td>\n",
       "      <td>0.0</td>\n",
       "      <td>19.29</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>5.27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TCGA-CH-5752</th>\n",
       "      <td>9.52</td>\n",
       "      <td>0.05</td>\n",
       "      <td>40.66</td>\n",
       "      <td>9.80</td>\n",
       "      <td>3.90</td>\n",
       "      <td>1.86</td>\n",
       "      <td>5.15</td>\n",
       "      <td>65.91</td>\n",
       "      <td>19.42</td>\n",
       "      <td>23.03</td>\n",
       "      <td>...</td>\n",
       "      <td>0.09</td>\n",
       "      <td>0.17</td>\n",
       "      <td>0.0</td>\n",
       "      <td>15.34</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.40</td>\n",
       "      <td>0.78</td>\n",
       "      <td>0.0</td>\n",
       "      <td>19.91</td>\n",
       "      <td>5.42</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SC_9126</th>\n",
       "      <td>9.88</td>\n",
       "      <td>0.08</td>\n",
       "      <td>82.23</td>\n",
       "      <td>8.36</td>\n",
       "      <td>40.20</td>\n",
       "      <td>6.96</td>\n",
       "      <td>17.35</td>\n",
       "      <td>49.75</td>\n",
       "      <td>115.81</td>\n",
       "      <td>34.24</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00</td>\n",
       "      <td>4.43</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.55</td>\n",
       "      <td>1.25</td>\n",
       "      <td>0.0</td>\n",
       "      <td>22.58</td>\n",
       "      <td>0.23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PROS01448-6115227-SM-67ERU</th>\n",
       "      <td>20.26</td>\n",
       "      <td>0.08</td>\n",
       "      <td>53.89</td>\n",
       "      <td>10.19</td>\n",
       "      <td>6.66</td>\n",
       "      <td>0.43</td>\n",
       "      <td>1.67</td>\n",
       "      <td>54.36</td>\n",
       "      <td>58.44</td>\n",
       "      <td>32.36</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.20</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.98</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.20</td>\n",
       "      <td>0.17</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.18</td>\n",
       "      <td>2.05</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 19192 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                            TSPAN6  TNMD   DPM1  SCYL3  C1orf112   FGR    CFH  \\\n",
       "Tumor_Sample_Barcode                                                            \n",
       "TCGA-EJ-5499                 60.67  0.05  41.57   9.29      3.16  2.29   9.87   \n",
       "MO_1012                      35.08  0.00  31.77   5.27     12.04  1.21  11.49   \n",
       "TCGA-CH-5752                  9.52  0.05  40.66   9.80      3.90  1.86   5.15   \n",
       "SC_9126                       9.88  0.08  82.23   8.36     40.20  6.96  17.35   \n",
       "PROS01448-6115227-SM-67ERU   20.26  0.08  53.89  10.19      6.66  0.43   1.67   \n",
       "\n",
       "                            FUCA2    GCLC   NFYA    ...      H2BE1  \\\n",
       "Tumor_Sample_Barcode                                ...              \n",
       "TCGA-EJ-5499                38.64   27.14  14.56    ...       0.00   \n",
       "MO_1012                     32.28   18.38  22.13    ...       0.00   \n",
       "TCGA-CH-5752                65.91   19.42  23.03    ...       0.09   \n",
       "SC_9126                     49.75  115.81  34.24    ...       0.00   \n",
       "PROS01448-6115227-SM-67ERU  54.36   58.44  32.36    ...       0.00   \n",
       "\n",
       "                            GET1-SH3BGR  SPDYE15  NOTCH2NLB  SPDYE13  ASDURF  \\\n",
       "Tumor_Sample_Barcode                                                           \n",
       "TCGA-EJ-5499                       0.38      0.0      27.48      0.0    3.14   \n",
       "MO_1012                            0.00      0.0      13.94      0.0   19.29   \n",
       "TCGA-CH-5752                       0.17      0.0      15.34      0.0    5.40   \n",
       "SC_9126                            4.43      0.0       0.00      0.0    4.55   \n",
       "PROS01448-6115227-SM-67ERU         0.20      0.0       4.98      0.0    1.20   \n",
       "\n",
       "                            SPEGNB  SPDYE14  DERPC  NOTCH2NLC  \n",
       "Tumor_Sample_Barcode                                           \n",
       "TCGA-EJ-5499                  0.39      0.0  19.24       2.06  \n",
       "MO_1012                       0.00      0.0   0.00       5.27  \n",
       "TCGA-CH-5752                  0.78      0.0  19.91       5.42  \n",
       "SC_9126                       1.25      0.0  22.58       0.23  \n",
       "PROS01448-6115227-SM-67ERU    0.17      0.0   5.18       2.05  \n",
       "\n",
       "[5 rows x 19192 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the rounded table, now indexed by Tumor_Sample_Barcode.\n",
    "tpm_data_with_propoer_id.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the rounded, barcode-indexed TPM table to the processed-data folder.\n",
    "filename = join(PROSTATE_DATA_PATH, 'processed/P1000_data_tpm.csv')\n",
    "tpm_data_with_propoer_id.to_csv(path_or_buf=filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(659, 19192)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Final (rows, columns) of the exported table; the recorded run shows (659, 19192).\n",
    "tpm_data_with_propoer_id.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:min_env]",
   "language": "python",
   "name": "conda-env-min_env-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
